library("DESeq2") # package for DESeq run
library("combinat")
library("BiocParallel") # for parallel evaluation
library("dplyr") # for data compilation
library("ggplot2") # plotting
library("vsn") #
library("pheatmap") # for heatmap plotting
library("RColorBrewer") # provides several color palette
# Read in the raw count dataset as a matrix (genes x samples).
cts <- as.matrix(read.csv("PC9_expression_matrix.csv", row.names = 1))
# Read in the sample annotation table; first column becomes the rownames.
coldata <- read.csv("colData.csv", row.names=1)
# Keep only the columns used downstream: file_id, condition, read_type.
coldata <- coldata[,c("file_id", "condition", "read_type")]
# DESeq2 requires the design variables to be factors.
coldata$condition <- factor(coldata$condition)
# Same for the sequencing read type.
coldata$read_type <- factor(coldata$read_type)
# Use file_id as rownames so samples can be matched to count-matrix columns.
# (Original comment claimed an 'fb' prefix was stripped here; it is not.)
rownames(coldata) <- coldata$file_id
# Subset and reorder the count columns to match the sample order of coldata
# (DESeq2 requires columns of cts to align with rows of coldata).
cts <- cts[, rownames(coldata)]
# DESeq2 needs integer counts; round in case of non-integer estimates.
cts <- round(cts)
# Build the metadata table describing the three groups to compare.
# NOTE(review): assumes coldata rows 1-3 are Untreated, 4-6 Day3, 7-9 Day9,
# in that order -- confirm against colData.csv before trusting the labels.
metadata <- data.frame(Untreated = coldata$file_id[1:3],
Day3 = coldata$file_id[4:6],
Day9 = coldata$file_id[7:9])
# Run the multi-group pairwise DESeq2 comparison (project helper defined
# elsewhere); returns a list with dds_list, results_list and resLFC_list.
multi <- DESeq_multi_group(exprSet = cts, meta = metadata, coldata = coldata)
## converting counts to integer mode
## factor levels were dropped which had no samples
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
## converting counts to integer mode
## factor levels were dropped which had no samples
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
## converting counts to integer mode
## factor levels were dropped which had no samples
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
# Unpack the components returned by DESeq_multi_group.
dds_list <- multi$dds_list          # one DESeqDataSet per pairwise comparison
results_list <- multi$results_list  # raw DESeq2 results tables
resLFC_list <- multi$resLFC_list    # shrunken log2-fold-change results
# MA plots for each pairwise comparison: raw results vs. shrunken LFCs.
# (A stray "–>" paste artifact preceded this loop in the original; it is
# not valid R and has been removed.)
for (i in seq_along(results_list)) {
  plotMA(results_list[[i]], ylim = c(-2, 2))  # MA-plot of raw results
  plotMA(resLFC_list[[i]], ylim = c(-2, 2))   # MA-plot of shrunken LFCs
}
# Interactive MA-plot point identification (manual, interactive use only):
# idx <- identify(res$baseMean, res$log2FoldChange)
# rownames(res)[idx]
# Shrink log2 fold changes with all three available estimators.
# coef=2 selects the second model coefficient (the condition contrast of
# interest, e.g. treated vs untreated).
resNorm_list <- list()
resApm_list <- list()
resAsh_list <- list()
for (i in seq_along(results_list)) {
  comparison <- names(results_list[i])
  # 'normal' prior (Love et al. 2014)
  resNorm_list[[comparison]] <- lfcShrink(dds_list[[i]], coef = 2, type = "normal")
  # 'apeglm' prior (Zhu et al. 2018) -- the original comment here wrongly
  # said 'normal'; apeglm has been shown to be less biased than 'normal'.
  resApm_list[[comparison]] <- lfcShrink(dds_list[[i]], coef = 2, type = "apeglm")
  # 'ashr' prior (Stephens 2016); also less biased than 'normal'.
  resAsh_list[[comparison]] <- lfcShrink(dds_list[[i]], coef = 2, type = "ashr")
}
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
# Compare the three shrinkage estimators side-by-side for each comparison.
for (i in seq_along(results_list)) {
  par(mfrow = c(1, 3), mar = c(4, 4, 2, 1))  # three panels in one row
  xlim <- c(1, 1e5); ylim <- c(-3, 3)        # shared axis limits across panels
  plotMA(resApm_list[[i]], xlim = xlim, ylim = ylim, main = "apeglm")
  plotMA(resNorm_list[[i]], xlim = xlim, ylim = ylim, main = "normal")
  plotMA(resAsh_list[[i]], xlim = xlim, ylim = ylim, main = "ashr")
}
# Plot normalized counts of the most significant gene (smallest padj)
# for each comparison.
for (i in seq_along(results_list)) {
  top_gene <- which.min(results_list[[i]]$padj)  # hoisted; computed once
  plotCounts(dds_list[[i]], gene = top_gene, intgroup = "condition")
  d <- plotCounts(dds_list[[i]], gene = top_gene, intgroup = "condition",
                  returnData = TRUE)
  # Bug fix: ggplot objects are NOT auto-printed inside a for loop, so the
  # original ggplot never rendered. print() is required here.
  print(
    ggplot(d, aes(x = condition, y = count)) +
      geom_point(position = position_jitter(w = 0.1, h = 0)) +
      scale_y_log10(breaks = c(25, 100, 400))
  )
}
# Show the meaning of each column in the results table.
# Bug fix: the original used the stale loop index `i` left over from the
# preceding loop; index the last comparison explicitly (same element).
mcols(results_list[[length(results_list)]])$description
## [1] "mean of normalized counts for all samples"
## [2] "log2 fold change (MLE): condition Day9 vs Day3"
## [3] "standard error: condition Day9 vs Day3"
## [4] "Wald statistic: condition Day9 vs Day3"
## [5] "Wald test p-value: condition Day9 vs Day3"
## [6] "BH adjusted p-values"
# Apply the three count transformations to each comparison.
vsd_list <- list()
rld_list <- list()
ntd_list <- list()
for (i in seq_along(results_list)) {
  comparison <- names(results_list[i])
  vsd <- vst(dds_list[[i]], blind = FALSE)   # variance stabilizing transform
  rld <- rlog(dds_list[[i]], blind = FALSE)  # regularized log transform
  ntd <- normTransform(dds_list[[i]])        # log2(n + 1) transform
  # Bug fix: head() inside a loop is not auto-printed; without print() the
  # original preview was silently discarded.
  print(head(assay(vsd), 3))
  vsd_list[[comparison]] <- vsd
  rld_list[[comparison]] <- rld
  ntd_list[[comparison]] <- ntd
}
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
library("vsn")  # NOTE: already attached at the top of the script; harmless duplicate
# Mean-SD plots: check whether variance is stabilized across the mean range
# for each of the three transformations.
for (i in seq_along(results_list)) {
  meanSdPlot(assay(ntd_list[[i]]))  # log2(n + 1) transform
  meanSdPlot(assay(vsd_list[[i]]))  # variance stabilizing transform
  meanSdPlot(assay(rld_list[[i]]))  # regularized log transform
}
## Data quality assessment by sample clustering and visualization
# Heatmaps of the 20 genes with the highest mean normalized counts,
# under each of the three transformations.
for (i in seq_along(results_list)) {
  select <- order(rowMeans(counts(dds_list[[i]], normalized = TRUE)),
                  decreasing = TRUE)[1:20]
  df <- as.data.frame(colData(dds_list[[i]])[, c("condition", "read_type")])
  # log2(n + 1) transform
  pheatmap(assay(ntd_list[[i]])[select, ], cluster_rows = FALSE,
           show_rownames = FALSE, cluster_cols = FALSE, annotation_col = df)
  # variance stabilizing transform
  pheatmap(assay(vsd_list[[i]])[select, ], cluster_rows = FALSE,
           show_rownames = FALSE, cluster_cols = FALSE, annotation_col = df)
  # regularized log transform
  pheatmap(assay(rld_list[[i]])[select, ], cluster_rows = FALSE,
           show_rownames = FALSE, cluster_cols = FALSE, annotation_col = df)
}
## Heatmaps of sample-to-sample distances (on VST-transformed data).
for (i in seq_along(results_list)) {
  sampleDists <- dist(t(assay(vsd_list[[i]])))   # Euclidean sample distances
  sampleDistMatrix <- as.matrix(sampleDists)
  # Bug fix: the colData column is named 'read_type', not 'type' (see the
  # coldata preparation above); '$type' returned NULL and silently dropped
  # the read type from the row labels.
  rownames(sampleDistMatrix) <- paste(vsd_list[[i]]$condition,
                                      vsd_list[[i]]$read_type, sep = "-")
  colnames(sampleDistMatrix) <- NULL
  colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)  # blue gradient
  pheatmap(sampleDistMatrix,
           clustering_distance_rows = sampleDists,
           clustering_distance_cols = sampleDists,
           col = colors)
}
# PCA of each comparison on VST data (plotPCA default: top 500 genes by
# variance). Generalized from three hard-coded [[1]]..[[3]] calls so it
# works for any number of comparisons; print() is required because ggplot
# objects are not auto-printed inside a loop.
for (i in seq_along(vsd_list)) {
  print(plotPCA(vsd_list[[i]], intgroup = c("condition", "read_type")))
}
# Customized PCA plot with ggplot2 for each comparison.
for (i in seq_along(results_list)) {
  pcaData <- plotPCA(vsd_list[[i]], intgroup = c("condition", "read_type"),
                     returnData = TRUE)
  percentVar <- round(100 * attr(pcaData, "percentVar"))
  # Bug fixes: (1) the grouping column is 'read_type', not 'type' -- the
  # original aes(shape=type) referenced a nonexistent column and would error;
  # (2) ggplot objects inside a for loop must be print()ed or nothing draws.
  print(
    ggplot(pcaData, aes(PC1, PC2, color = condition, shape = read_type)) +
      geom_point(size = 3) +
      xlab(paste0("PC1: ", percentVar[1], "% variance")) +
      ylab(paste0("PC2: ", percentVar[2], "% variance")) +
      coord_fixed()
  )
}
## using ntop=500 top features by variance
## using ntop=500 top features by variance
## using ntop=500 top features by variance